M1 Lab

Matthew Præstgaard

due 01/31/2024

All files can be found at https://github.com/matthewep528/BIOSTLabs

library(tidyverse)
library(readxl)
library(car)
library(plotly)
library(ggthemr)
ggthemr("flat dark")
library(gridExtra)
library(ggh4x)
library(jtools)
library(kableExtra)
library(shiny)

Functions

# use to plot linear model with summary statistics and error
# Usage: ggRegression(data, title = "title" (optional))
ggRegression <- function(fit, title = "title") {
  require(ggplot2)

  ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
    geom_point() +
    stat_smooth(
      method = "lm",
      geom = "smooth"
    ) +
    labs(
      title = title,
      subtitle = paste(
        "Adj R2 = ", signif(summary(fit)$adj.r.squared, 5),
        "Intercept =", signif(fit$coef[[1]], 5),
        " Slope =", signif(fit$coef[[2]], 5),
        " P =", signif(summary(fit)$coef[2, 4], 5)
      )
    )
}
# plotlyRegression: Scatterplot with line, SE, and summary statistics
# requires a model to have been created beforehand
# Usage: plotlyRegression(fit, title = "title" (optional))
plotlyRegression <- function(fit, title = "title") {
  require(plotly)

  # Extract model data
  df <- fit$model
  x_var <- names(df)[2]
  y_var <- names(df)[1]

  # Create a scatter plot
  p <- plot_ly(df, x = ~ df[[x_var]], y = ~ df[[y_var]], type = "scatter", mode = "markers")

  # Get confidence interval data
  confint_df <- data.frame(predict(fit, interval = "confidence"))
  confint_df[[x_var]] <- df[[x_var]] # Adding the x variable to the dataframe

  # Add linear regression line and 95% confidence interval
  p <- p %>%
    add_lines(
      x = confint_df[[x_var]],
      y = confint_df$fit,
      line = list(color = "#0aca8d")
    ) %>%
    add_ribbons(
      x = confint_df[[x_var]],
      ymin = confint_df$lwr,
      ymax = confint_df$upr,
      line = list(color = "transparent"),
      fillcolor = "rgba(0, 100, 255, 0.2)"
    )

  # Create subtitle with summary statistics
  subtitle <- paste(
    "Adj R2 =", signif(summary(fit)$adj.r.squared, 5),
    ", Intercept =", signif(fit$coef[[1]], 5),
    ", Slope =", signif(fit$coef[[2]], 5),
    ", P =", signif(summary(fit)$coef[2, 4], 5)
  )

  # Add layout with combined title and subtitle
  p <- p %>%
    layout(
      title = list(
        text = paste0(title, "<br><sup>", subtitle, "</sup>"),
        font = list(size = 16)
      ),
      plot_bgcolor = "#2c2c2c",
      paper_bgcolor = "#2c2c2c",
      font = list(color = "#ffffff"),
      xaxis = list(title = x_var, color = "#ffffff"),
      yaxis = list(title = y_var, color = "#ffffff")
    )

  return(p)
}
# Function to summarize variables with optional grouping
# Usage: summarize.var(dataset, var, group_var (optional))
summarize.var <- function(dataset, var, group_var = NULL) {
  require(tidyverse)
  require(rlang)

  result <- dataset %>%
    {
      if (!is.null(group_var)) dplyr::group_by({{ group_var }}) else .
    } %>%
    summarise(
      mean = mean({{ var }}, na.rm = TRUE),
      median = median({{ var }}, na.rm = TRUE),
      min = min({{ var }}, na.rm = TRUE),
      max = max({{ var }}, na.rm = TRUE),
      q25 = quantile({{ var }}, 0.25, na.rm = TRUE), # first quartile
      q75 = quantile({{ var }}, 0.75, na.rm = TRUE), # third quartile
      sd = sd({{ var }}, na.rm = TRUE),
      n = length(na.omit({{ var }}))
    ) %>%
    mutate(IQR = q75 - q25)

  return(result)
}

Data Management

setwd("Lab1")
## Error in setwd("Lab1"): cannot change working directory
data <- read_xlsx("hwdata1.xlsx")
kable(head(data), caption = "Table 1.")
Table 1.
HDL cholesterol triglyceride spbl
47 287 111 0
38 236 135 0
47 255 98 0
39 135 63 0
44 121 46 0
64 171 103 0
HDLsum <- summarize.var(data, HDL)
Cholsum <- summarize.var(data, cholesterol)
Trisum <- summarize.var(data, triglyceride)

summary <- rbind(HDLsum, Cholsum, Trisum)
summary$Variable <- c("HDL", "Cholesterol", "Triglyceride")
summary <- summary %>%
  select(Variable, everything()) %>%
  data.frame()
kable(summary)
Variable mean median min max q25 q75 sd n IQR
HDL 47.7619 46.5 27 76 40 56.00 10.60789 42 16.00
Cholesterol 267.8095 263.5 121 397 248 293.25 60.40321 42 45.25
Triglyceride 155.0476 140.0 46 408 103 179.00 73.75039 42 76.00

Separate SLR Models

We will start by fitting separate models to see if any of the thre predictors of interest are associated with the outcome.

Part 1) Fitting SLR models

Fit separate simple linear regression models for assessing the association of total cholesterol level (cholesterol), total triglyceride level (triglyceride) and sinking pre-beta lipoprotein (spbl) with HDL (HDL) (mg/dL). Write the estimated regression equation and interpret the slope coefficient for each of the three models.

# model with just cholesterol
lmChol <- lm(HDL ~ cholesterol, data = data)
# ggRegression(lmChol, title = "Fig 1. Plot of HDL by Cholesterol.")
plotlyRegression(lmChol, title = "Fig 1. Plot of HDL by Cholesterol.")
# model with just triglycerides
lmTri <- lm(HDL ~ triglyceride, data = data)
plotlyRegression(lmTri, title = "Fig 2. Plot of HDL by Triglycerides.")
# ggRegression(lmTri, title = "Fig 2. Plot of HDL by Triglycerides.")

# model with just sinking pre-beta
lmSPBL <- glm(HDL ~ spbl, data = data)
summ(lmSPBL)
Observations 42
Dependent variable HDL
Type Linear regression
χ²(1) 735.21
Pseudo-R² (Cragg-Uhler) 0.16
Pseudo-R² (McFadden) 0.02
AIC 315.26
BIC 320.48
Est. S.E. t val. p
(Intercept) 43.77 2.10 20.85 0.00
spbl 8.38 3.04 2.75 0.01
Standard errors: MLE
## make sure to show the output for the models

Regression models and interpretations here:

Part 2) Hypothesis testing in SLR models

Using the separate models fit in part 1, test whether total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein alone statistically significantly predict y. Interpret your answer.

# show test/model output here

MLR models

We now wish to improve our estimation of HDL by incorporating multiple predictors in the same model. This will also allow us to assess possible confounding and effect modification.

Part 3) MLR model equation

Write the theoretical (population) regression equation that would be used to test whether total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein taken together statistically significantly improve prediction of HDL. Interpret the coefficients in this model.

Note: for the regression equation you may use latex code such as: \[y = \beta_0 + \beta_1x_1 + \epsilon \] or you can just use regular text: y = b0 + b1*x + e

Model:

Interpretations:

Part 4) Fitting a MLR model and performing a global test

Fit the model specified above and test if total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein taken together statistically significantly improve prediction of HDL. Interpret your answer.

# fit MLR model
# REMINDER: syntax for 2 predictors is lm(outcome ~ predictor1 + predictor2, data = data_name)

## make sure to show the output for the model

Part 5) Variable added test

Fit a multivariable model and test whether sinking pre-beta (spbl) is associated with HDL after the combined contribution of cholesterol and triglyceride levels is taken into account (i.e., cholesterol and triglycerides are already in the model). Assume no interactions exist. State the appropriate null hypothesis for this test and interpret the results of the test.

# code for relevant test here (show your output)

Null hypothesis:

Test results and interpretation:

Part 6) Model for effect modification

Suppose we are interested in evaluating if the association between cholesterol and HDL and the association between triglycerides and HDL depends on the absence/presence of SPBL. Write down the model statement for the model you need to fit to assess this and interpret the coefficients in this model.

Model:

Interpretation of coefficients:

Part 7) Testing for effect modification

Fit a multivariable model and test whether the interactions of cholesterol with SPBL and triglycerides with SPBL are simultaneously equal to zero in a model already containing the three main effects for cholesterol, triglycerides, and SPBL. State the null hypothesis of the test. Given the result of the test, what can you conclude about the relationship of HDL to both cholesterol and triglycerides (hint: remember what an interaction means)?

# code to fit model

# test

Null hypothesis:

Interpretation:

Part 8) Checking for confounding

For sinking pre-beta lipoprotein (spbl), compare the coefficient for spbl in the full model (in part 4) to the simple model in parts 1 and 2. Do you think there is confounding due to cholesterol and triglycerides? Explain.

# code to check for confounding

Explanation:

Part 9) Model for different levels of categorical variables

Assume straight line models are appropriate for describing the relationship between HDL and total cholesterol level for the absence and the presence of sinking pre-beta. Write out a single regression model that specifies two separate lines for the relationship between HDL and cholesterol, one for the absence of sinking pre-beta lipoprotein (spbl=0) and one for the presence of sinking pre-beta lipoprotein (spbl=1). (You can omit triglycerides from this model). Run code to fit this model.

Model:

# fit model

## make sure to show the output for the model

Part 10) Interaction vs stratification

Fit separate models for each level of SPBL and compare these results to the results you got in part 9.

# fit model for SPBL = 0

# fit model for SPBL = 1

## make sure to show the output for the models

Explanation of similarities differences compared to part 9 here:

Part 11) Plotting separate lines

Plot the observed data and the fitted lines from part 9 for each value of sinking pre-beta (either 2 separate graphs with the same y axes or combined in one graph).

# plot

Describe what you see:

Part 12) Testing for equality of lines

Fit the model you need to test for coincidence (equality) of the two lines in part 9. Perform the test and interpret.

# test for equality of lines

Interpretation:

Part 13) Testing for parallel lines

Fit the model you need to test for parallelism (equal slopes) of the two lines in part 9. Perform the test and interpret.

# test for parallel lines (equal slopes)

Interpretation

Part 14) Centering variables

Center the cholesterol and triglyceride variables at their mean value and refit the model from part 9. How do the models compare (which coefficients change and which stay the same)? Interpret the intercept in both models.

# center values

# refit models with centered values

## make sure to show the output

Interpretation of intercept in original model:

Interpretation of intercept in new model with centered variables:

Part 15) Scaling varibales

In this dataset, the units of measurement for cholesterol is mg/dL. Suppose we wish to convert this to mmol/L. To do so, we divide the value of cholesterol by 38.67. That is,

\[x \text{ mg/dL} = x \text{ mg/dL}\frac{1 \text{ mmol/L}}{38.67\text{ mg/dL}} =\frac{x}{38.67}\text{ mmol/L}\]

Apply this transformation and compare the simple linear regression model you fit in part 1 for cholesterol to a new model using the (uncentered) cholesterol value measured in mmol/L. What do you notice about the new regression coefficients?

# rescale variable

# refit model and compare to original model

## make sure to show the output

Interpretation:

Session Information

sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 25314)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rlang_1.1.1      shiny_1.7.5      kableExtra_1.3.4 jtools_2.2.2     ggh4x_0.2.6      gridExtra_2.3    ggthemr_1.1.0    plotly_4.10.2    car_3.1-2        carData_3.0-5   
## [11] readxl_1.4.3     lubridate_1.9.3  forcats_1.0.0    stringr_1.5.0    dplyr_1.1.3      purrr_1.0.2      readr_2.1.4      tidyr_1.3.0      tibble_3.2.1     ggplot2_3.4.3   
## [21] tidyverse_2.0.0  knitr_1.44      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0  viridisLite_0.4.2 fastmap_1.1.1     lazyeval_0.2.2    promises_1.2.1    digest_0.6.33     timechange_0.2.0  mime_0.12         lifecycle_1.0.3  
## [10] ellipsis_0.3.2    magrittr_2.0.3    compiler_4.3.1    sass_0.4.7        tools_4.3.1       utf8_1.2.3        yaml_2.3.7        data.table_1.14.8 labeling_0.4.3   
## [19] htmlwidgets_1.6.2 xml2_1.3.5        abind_1.4-5       withr_2.5.1       grid_4.3.1        fansi_1.0.5       xtable_1.8-4      colorspace_2.1-0  scales_1.2.1     
## [28] cli_3.6.1         rmarkdown_2.25    crayon_1.5.2      generics_0.1.3    rstudioapi_0.15.0 httr_1.4.7        tzdb_0.4.0        cachem_1.0.8      pander_0.6.5     
## [37] splines_4.3.1     rvest_1.0.3       cellranger_1.1.0  rmdformats_1.0.4  vctrs_0.6.3       Matrix_1.6-1.1    webshot_0.5.5     jsonlite_1.8.7    bookdown_0.35    
## [46] hms_1.1.3         systemfonts_1.0.5 crosstalk_1.2.0   jquerylib_0.1.4   glue_1.6.2        stringi_1.7.12    gtable_0.3.4      later_1.3.1       munsell_0.5.0    
## [55] pillar_1.9.0      htmltools_0.5.6.1 R6_2.5.1          evaluate_0.22     lattice_0.21-9    highr_0.10        memoise_2.0.1     httpuv_1.6.11     bslib_0.5.1      
## [64] Rcpp_1.0.11       svglite_2.1.2     nlme_3.1-163      mgcv_1.9-0        xfun_0.40         pkgconfig_2.0.3